Setup library

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(RColorBrewer)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## 
## Attaching package: 'S4Vectors'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## 
## Attaching package: 'MatrixGenerics'
## 
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## 
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'Biobase'
## 
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## 
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(pheatmap)
library(DEGreport)
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(edgeR)
## Loading required package: limma
## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

Reading in the data

raw_data355 <- read_tsv("3_500_500.tsv", show_col_types = FALSE)
## New names:
## • `` -> `...1`
raw_data372 <- read_tsv("3_750_250.tsv", show_col_types = FALSE)
## New names:
## • `` -> `...1`
raw_data3100 <- read_tsv("3_1000_0.tsv", show_col_types = FALSE)
## New names:
## • `` -> `...1`
raw_data655 <- read_tsv("6_500_500.tsv", show_col_types = FALSE)
## New names:
## • `` -> `...1`
raw_data672 <- read_tsv("6_750_250.tsv", show_col_types = FALSE)
## New names:
## • `` -> `...1`
raw_data6100 <- read_tsv("6_1000_0.tsv", show_col_types = FALSE)
## New names:
## • `` -> `...1`
raw_data955 <- read_tsv("9_500_500.tsv", show_col_types = FALSE)
## New names:
## • `` -> `...1`
raw_data972 <- read_tsv("9_750_250.tsv", show_col_types = FALSE)
## New names:
## • `` -> `...1`
raw_data9100 <- read_tsv("9_1000_0.tsv", show_col_types = FALSE)
## New names:
## • `` -> `...1`

Build a DGEList object

# build a DGEList object from the rawdata count matrix
# "group" variable, factor giving the experimental condition for each sample.
group_condition_3 <- c("condition 1","condition 1","condition 1",
                       "condition 2","condition 2","condition 2")
group_condition_6 <- c("condition 1","condition 1","condition 1", 
                       "condition 1","condition 1","condition 1",
                       "condition 2","condition 2","condition 2",
                       "condition 2","condition 2","condition 2")  
group_condition_9 <- c("condition 1","condition 1","condition 1", 
                       "condition 1","condition 1","condition 1",
                       "condition 1","condition 1","condition 1",
                       "condition 2","condition 2","condition 2",
                       "condition 2","condition 2","condition 2",
                       "condition 2","condition 2","condition 2") 

gene_table_355 <- data.frame(gene = paste('g',rownames(raw_data355),sep=""),
                             stringsAsFactors = FALSE)
gene_table_372 <- data.frame(gene = paste('g',rownames(raw_data372),sep=""),
                             stringsAsFactors = FALSE)
gene_table_3100 <- data.frame(gene = paste('g',rownames(raw_data3100),sep=""),
                             stringsAsFactors = FALSE)

gene_table_655 <- data.frame(gene = paste('g',rownames(raw_data655),sep=""),
                             stringsAsFactors = FALSE)
gene_table_672 <- data.frame(gene = paste('g',rownames(raw_data672),sep=""),
                             stringsAsFactors = FALSE)
gene_table_6100 <- data.frame(gene = paste('g',rownames(raw_data6100),sep=""),
                             stringsAsFactors = FALSE)

gene_table_955 <- data.frame(gene = paste('g',rownames(raw_data955),sep=""),
                             stringsAsFactors = FALSE)
gene_table_972 <- data.frame(gene = paste('g',rownames(raw_data972),sep=""),
                             stringsAsFactors = FALSE)
gene_table_9100 <- data.frame(gene = paste('g',rownames(raw_data9100),sep=""),
                             stringsAsFactors = FALSE)


dge_355 <- DGEList(counts = raw_data355[, 2:7], group = group_condition_3, genes = gene_table_355)
dge_372 <- DGEList(counts = raw_data372[, 2:7], group = group_condition_3, genes = gene_table_372)
dge_3100 <- DGEList(counts = raw_data3100[, 2:7], group = group_condition_3, genes = gene_table_3100)

dge_655 <- DGEList(counts = raw_data655[, 2:13], group = group_condition_6, genes = gene_table_655)
dge_672 <- DGEList(counts = raw_data672[, 2:13], group = group_condition_6, genes = gene_table_672)
dge_6100 <- DGEList(counts = raw_data6100[, 2:13], group = group_condition_6, genes = gene_table_6100)

dge_955 <- DGEList(counts = raw_data955[, 2:19], group = group_condition_9, genes = gene_table_955)
dge_972 <- DGEList(counts = raw_data972[, 2:19], group = group_condition_9, genes = gene_table_972)
dge_9100 <- DGEList(counts = raw_data9100[, 2:19], group = group_condition_9, genes = gene_table_9100)

Filtering

Genes with very low counts across all libraries provide little evidence for differential expression. These genes should be filtered out prior to further analysis.

keep_355 <- filterByExpr(dge_355)
dge_355 <- dge_355[keep_355, , keep.lib.sizes=FALSE]
keep_372 <- filterByExpr(dge_372)
dge_372 <- dge_372[keep_372, , keep.lib.sizes=FALSE]
keep_3100 <- filterByExpr(dge_3100)
dge_3100 <- dge_3100[keep_3100, , keep.lib.sizes=FALSE]

keep_655 <- filterByExpr(dge_655)
dge_655 <- dge_655[keep_655, , keep.lib.sizes=FALSE]
keep_672 <- filterByExpr(dge_672)
dge_672 <- dge_672[keep_672, , keep.lib.sizes=FALSE]
keep_6100 <- filterByExpr(dge_6100)
dge_6100 <- dge_6100[keep_6100, , keep.lib.sizes=FALSE]

keep_955 <- filterByExpr(dge_955)
dge_955 <- dge_955[keep_955, , keep.lib.sizes=FALSE]
keep_972 <- filterByExpr(dge_972)
dge_972 <- dge_972[keep_972, , keep.lib.sizes=FALSE]
keep_9100 <- filterByExpr(dge_9100)
dge_9100 <- dge_9100[keep_9100, , keep.lib.sizes=FALSE]

TMM normalization

Once a DGEList has been created, we calculate between-sample (TMM) normalization factors, using the calcNormFactors function in edgeR.

# normalization
dge_355 <- calcNormFactors(dge_355)
dge_355$samples
##               group lib.size norm.factors
## sample1 condition 1  9936563    1.0204249
## sample2 condition 1 13153258    1.0263423
## sample3 condition 1 11093285    1.0263975
## sample4 condition 2  8765931    0.9574026
## sample5 condition 2 12273264    0.9847761
## sample6 condition 2  6926265    0.9866862
dge_372 <- calcNormFactors(dge_372)
dge_372$samples
##               group lib.size norm.factors
## sample1 condition 1 11052851    1.0418132
## sample2 condition 1 11092133    1.0300592
## sample3 condition 1 12071061    1.0419756
## sample4 condition 2  8163997    0.9536709
## sample5 condition 2 12125226    0.9530166
## sample6 condition 2  9953982    0.9839916
dge_3100 <- calcNormFactors(dge_3100)
dge_3100$samples
##               group lib.size norm.factors
## sample1 condition 1 11728979    1.0672315
## sample2 condition 1 11288644    1.0358057
## sample3 condition 1 10492645    1.0400344
## sample4 condition 2 10441968    0.9575966
## sample5 condition 2  7883293    0.9613849
## sample6 condition 2 11836049    0.9447904
dge_655 <- calcNormFactors(dge_655)
dge_655$samples
##                group lib.size norm.factors
## sample1  condition 1  9367027    1.0488148
## sample2  condition 1 10688378    0.9882631
## sample3  condition 1 11411420    1.0012047
## sample4  condition 1 12077666    1.0260231
## sample5  condition 1 11574086    1.0310709
## sample6  condition 1 14454067    0.9832942
## sample7  condition 2 11278656    0.9920195
## sample8  condition 2 11469685    1.0113257
## sample9  condition 2 10039344    1.0021237
## sample10 condition 2 13242544    0.9471635
## sample11 condition 2 13355462    0.9818416
## sample12 condition 2  7087303    0.9907809
dge_672 <- calcNormFactors(dge_672)
dge_672$samples
##                group lib.size norm.factors
## sample1  condition 1  8453974    1.0125115
## sample2  condition 1  9697574    1.0148064
## sample3  condition 1  8224095    1.0260828
## sample4  condition 1  8150458    1.0300246
## sample5  condition 1  9126596    1.0537170
## sample6  condition 1  9239506    1.0483004
## sample7  condition 2  8631206    0.9801837
## sample8  condition 2  9211138    0.9562534
## sample9  condition 2  9950734    0.9646018
## sample10 condition 2  9592038    0.9542121
## sample11 condition 2 12538572    0.9744893
## sample12 condition 2 11415450    0.9915774
dge_6100 <- calcNormFactors(dge_6100)
dge_6100$samples
##                group lib.size norm.factors
## sample1  condition 1  9968884    1.0379582
## sample2  condition 1  9984227    1.0424475
## sample3  condition 1 10688642    1.0225005
## sample4  condition 1 14077440    1.0075952
## sample5  condition 1 10415180    1.0356272
## sample6  condition 1 11292813    1.0390788
## sample7  condition 2 12182179    0.9713173
## sample8  condition 2 11014085    0.9644529
## sample9  condition 2 13316384    0.9779047
## sample10 condition 2 12169138    0.9804631
## sample11 condition 2  8879064    0.9699505
## sample12 condition 2  8186285    0.9568523
dge_955 <- calcNormFactors(dge_955)
dge_955$samples
##                group lib.size norm.factors
## sample1  condition 1 12599528    1.0326722
## sample2  condition 1  8341146    1.0123499
## sample3  condition 1 12108225    1.0072010
## sample4  condition 1 11232575    1.0110704
## sample5  condition 1 10368372    1.0237936
## sample6  condition 1 12860049    1.0334680
## sample7  condition 1  8449147    1.0107370
## sample8  condition 1  9499399    1.0390927
## sample9  condition 1  8265931    1.0252758
## sample10 condition 2 10983053    0.9827603
## sample11 condition 2  7535175    0.9967394
## sample12 condition 2 10813105    1.0098507
## sample13 condition 2  9537814    0.9292800
## sample14 condition 2 10659751    0.9773668
## sample15 condition 2 12760433    1.0096787
## sample16 condition 2  8288705    0.9652265
## sample17 condition 2 10252760    0.9949266
## sample18 condition 2 11757532    0.9463954
dge_972 <- calcNormFactors(dge_972)
dge_972$samples
##                group lib.size norm.factors
## sample1  condition 1 13198205    1.0448570
## sample2  condition 1  7218350    1.0822290
## sample3  condition 1 10713463    1.0232390
## sample4  condition 1 11438368    1.0079703
## sample5  condition 1 12684194    1.0245679
## sample6  condition 1 10723228    1.0158516
## sample7  condition 1 11352651    0.9912454
## sample8  condition 1  7163783    1.0539153
## sample9  condition 1  9735525    1.0460639
## sample10 condition 2  7308354    0.9746116
## sample11 condition 2 10661454    0.9555166
## sample12 condition 2 10474575    0.9724643
## sample13 condition 2 14143690    0.9685194
## sample14 condition 2 10884146    0.9729029
## sample15 condition 2  8559671    0.9858265
## sample16 condition 2 12813821    0.9581131
## sample17 condition 2 11885467    0.9682145
## sample18 condition 2 12655347    0.9659908
dge_9100 <- calcNormFactors(dge_9100)
dge_9100$samples
##                group lib.size norm.factors
## sample1  condition 1  8540209    1.0692016
## sample2  condition 1 11060402    1.0432528
## sample3  condition 1  7535836    1.0687515
## sample4  condition 1 13214949    1.0550496
## sample5  condition 1 10740491    1.0221283
## sample6  condition 1  7771948    1.0214377
## sample7  condition 1 13882262    1.0355561
## sample8  condition 1 12204538    1.0415055
## sample9  condition 1  8394200    1.0598920
## sample10 condition 2  9652389    0.9536747
## sample11 condition 2  7159394    0.9304488
## sample12 condition 2  8533615    0.9500831
## sample13 condition 2 10358445    0.9589611
## sample14 condition 2  8157428    0.9658728
## sample15 condition 2  7729941    0.9517153
## sample16 condition 2 13722141    0.9632292
## sample17 condition 2 12849219    0.9643829
## sample18 condition 2  7466519    0.9649962

Data exploration

An MDS plot shows the relative similarities of the samples.

edgeR contains a function plotMDS, which operates on a DGEList object and generates a two-dimensional MDS representation of the samples. The default distance between two samples can be interpreted as the “typical” log fold change between the two samples, for the genes that are most different between them

# plotMDS
p355 <- plotMDS(dge_355, top = 1000, labels = NULL, 
                col = as.numeric(dge_355$samples$group), cex = 0.6)

#legend("bottomright", as.character(unique(dge$samples$group)), col = 1:2, pch = 19)
p372 <- plotMDS(dge_372, top = 1000, labels = NULL, 
                col = as.numeric(dge_372$samples$group), cex = 0.6)

p3100 <- plotMDS(dge_3100, top = 1000, labels = NULL, 
                col = as.numeric(dge_3100$samples$group), cex = 0.6)

p655 <- plotMDS(dge_655, top = 1000, labels = NULL, 
                col = as.numeric(dge_655$samples$group), cex = 0.6)

p672 <- plotMDS(dge_672, top = 1000, labels = NULL, 
                col = as.numeric(dge_672$samples$group), cex = 0.6)

p6100 <- plotMDS(dge_6100, top = 1000, labels = NULL, 
                col = as.numeric(dge_6100$samples$group), cex = 0.6)

p955 <- plotMDS(dge_955, top = 1000, labels = NULL, 
                col = as.numeric(dge_955$samples$group), cex = 0.6)

p972 <- plotMDS(dge_972, top = 1000, labels = NULL, 
                col = as.numeric(dge_972$samples$group), cex = 0.6)

p9100 <- plotMDS(dge_9100, top = 1000, labels = NULL, 
                col = as.numeric(dge_9100$samples$group), cex = 0.6)

figure <- ggarrange(p355, p372, p3100,
                    p655, p672, p6100,
                    p955, p972, p9100,
                    labels = c("3_500_500", "3_750_250", "3_1000_0", 
                               "6_500_500", "6_750_250", "6_1000_0", 
                               "9_500_500", "9_750_250", "9_1000_0"),
                    ncol = 3, nrow = 3)
## Warning in as_grob.default(plot): Cannot convert object of class MDS into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class MDS into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class MDS into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class MDS into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class MDS into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class MDS into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class MDS into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class MDS into a
## grob.

## Warning in as_grob.default(plot): Cannot convert object of class MDS into a
## grob.
annotate_figure(figure, top = text_grob("A two-dimensional MDS representation of the samples"))

annotate_figure
## function (p, top = NULL, bottom = NULL, left = NULL, right = NULL, 
##     fig.lab = NULL, fig.lab.pos = c("top.left", "top", "top.right", 
##         "bottom.left", "bottom", "bottom.right"), fig.lab.size, 
##     fig.lab.face) 
## {
##     fig.lab.pos <- match.arg(fig.lab.pos)
##     annot.args <- list(top = top, bottom = bottom, left = left, 
##         right = right) %>% .compact()
##     lab.args <- list(label = fig.lab, position = fig.lab.pos) %>% 
##         .compact()
##     if (!missing(fig.lab.size)) 
##         lab.args$size <- fig.lab.size
##     if (!missing(fig.lab.face)) 
##         lab.args$fontface <- fig.lab.face
##     if (!.is_empty(annot.args)) {
##         p <- gridExtra::arrangeGrob(p, top = top, bottom = bottom, 
##             left = left, right = right) %>% as_ggplot()
##     }
##     if (!is.null(fig.lab)) {
##         p <- cowplot::ggdraw(p) + do.call(cowplot::draw_figure_label, 
##             lab.args)
##     }
##     p
## }
## <bytecode: 0x7fc615632be0>
## <environment: namespace:ggpubr>

Differential expression analysis

Next we will show how to perform differential expression analysis with edgeR. Recall that we have a DGEList (dge), containing all the necessary information:

The classic edgeR pipeline: pairwise comparisons between two or more groups

  • Estimating dispersions
dge_355 <- estimateDisp(dge_355)
## Using classic mode.
plotBCV(dge_355)

dge_372 <- estimateDisp(dge_372)
## Using classic mode.
plotBCV(dge_372)

dge_3100 <- estimateDisp(dge_3100)
## Using classic mode.
plotBCV(dge_3100)

dge_655 <- estimateDisp(dge_655)
## Using classic mode.
plotBCV(dge_655)

dge_672 <- estimateDisp(dge_672)
## Using classic mode.
plotBCV(dge_672)

dge_6100 <- estimateDisp(dge_6100)
## Using classic mode.
plotBCV(dge_6100)

dge_955 <- estimateDisp(dge_955)
## Using classic mode.
plotBCV(dge_955)

dge_972 <- estimateDisp(dge_972)
## Using classic mode.
plotBCV(dge_972)

dge_9100 <- estimateDisp(dge_9100)
## Using classic mode.
plotBCV(dge_9100)

  • Testing for DE genes

For all the Next-Gen squencing data analyses we consider here, people are most interested in finding differentially expressed genes/tags between two (or more) groups. Once negative binomial models are fitted and dispersion estimates are obtained, we can proceed with testing procedures for determining differential expression using the exact test.

The exact test is based on the qCML methods. Knowing the conditional distribution for the sum of counts in a group, we can compute exact p-values by summing over all sums of counts that have a probability less than the probability under the null hypothesis of the observed sum of counts. The exact test for the negative binomial distribution has strong parallels with Fisher’s exact test.

As we dicussed in the previous section, the exact test is only applicable to experiments with a single factor. The testing can be done by using the function exactTest(), and the function allows both common dispersion and tagwise dispersion approaches.

# exactTest()
et_355 <- exactTest(dge_355)
et_372 <- exactTest(dge_372)
et_3100 <- exactTest(dge_3100)

et_655 <- exactTest(dge_655)
et_672 <- exactTest(dge_672)
et_6100 <- exactTest(dge_6100)

et_955 <- exactTest(dge_955)
et_972 <- exactTest(dge_972)
et_9100 <- exactTest(dge_9100)

# topTags()
df_export_355 <- topTags(et_355, n = 1000, p.value = 0.05)
df_export_372 <- topTags(et_372, n = 1000, p.value = 0.05)
df_export_3100 <- topTags(et_3100, n = 1000, p.value = 0.05)

df_export_655 <- topTags(et_655, n = 1000, p.value = 0.05)
df_export_672 <- topTags(et_672, n = 1000, p.value = 0.05)
df_export_6100 <- topTags(et_6100, n = 1000, p.value = 0.05)

df_export_955 <- topTags(et_955, n = 1000, p.value = 0.05)
df_export_972 <- topTags(et_972, n = 1000, p.value = 0.05)
df_export_9100 <- topTags(et_9100, n = 1000, p.value = 0.05)

# export .csv files
write.csv(df_export_355,"V2_EdgeR_3_500_500.csv", row.names = FALSE)
write.csv(df_export_372,"V2_EdgeR_3_750_250.csv", row.names = FALSE)
write.csv(df_export_3100,"V2_EdgeR_3_1000_0.csv", row.names = FALSE)

write.csv(df_export_655,"V2_EdgeR_6_500_500.csv", row.names = FALSE)
write.csv(df_export_672,"V2_EdgeR_6_750_250.csv", row.names = FALSE)
write.csv(df_export_6100,"V2_EdgeR_6_1000_0.csv", row.names = FALSE)

write.csv(df_export_955,"V2_EdgeR_9_500_500.csv", row.names = FALSE)
write.csv(df_export_972,"V2_EdgeR_9_750_250.csv", row.names = FALSE)
write.csv(df_export_9100,"V2_EdgeR_9_1000_0.csv", row.names = FALSE)

# combind table
et_355$table <- cbind(et_355$genes,et_355$table)
et_372$table <- cbind(et_372$genes,et_372$table)
et_3100$table <- cbind(et_3100$genes,et_3100$table)

et_655$table <- cbind(et_655$genes,et_655$table)
et_672$table <- cbind(et_672$genes,et_672$table)
et_6100$table <- cbind(et_6100$genes,et_6100$table)

et_955$table <- cbind(et_955$genes,et_955$table)
et_972$table <- cbind(et_972$genes,et_972$table)
et_9100$table <- cbind(et_9100$genes,et_9100$table)

The total number of differentially expressed genes at 5% FDR is given by:

# default: decideTests(object, adjust.method="BH", p.value = 0.05, lfc = 0, ...)
result_decideTest_355 <- decideTests(et_355, lfc = 0.58)
result_decideTest_372 <- decideTests(et_372, lfc = 0.58)
result_decideTest_3100 <- decideTests(et_3100, lfc = 0.58)

result_decideTest_655 <- decideTests(et_655, lfc = 0.58)
result_decideTest_672 <- decideTests(et_672, lfc = 0.58)
result_decideTest_6100 <- decideTests(et_6100, lfc = 0.58)

result_decideTest_955 <- decideTests(et_955, lfc = 0.58)
result_decideTest_972 <- decideTests(et_972, lfc = 0.58)
result_decideTest_9100 <- decideTests(et_9100, lfc = 0.58)

#summary(result_decideTest_655)
# add the new column, which is the output of decideTest, into genes dataframe
et_355$table <- cbind(et_355$table,data.frame(result_decideTest_355@.Data))
# add the diffexpressed collumn
et_355$table$diffexpressed <- "NotSig"
# if condition.2.condition.1 == -1 set as "Down"
et_355$table$diffexpressed[et_355$table$condition.2.condition.1 == -1] <- "Down"
# if condition.2.condition.1 == 1 set as "Up" 
et_355$table$diffexpressed[et_355$table$condition.2.condition.1 == 1] <- "Up"
# if condition.2.condition.1 == 0 set as "NotSig" 
et_355$table$diffexpressed[et_355$table$condition.2.condition.1 == 0] <- "NotSig"
#head(et$table)

et_372$table <- cbind(et_372$table,data.frame(result_decideTest_372@.Data))
et_372$table$diffexpressed <- "NotSig"
et_372$table$diffexpressed[et_372$table$condition.2.condition.1 == -1] <- "Down"
et_372$table$diffexpressed[et_372$table$condition.2.condition.1 == 1] <- "Up"
et_372$table$diffexpressed[et_372$table$condition.2.condition.1 == 0] <- "NotSig"

et_3100$table <- cbind(et_3100$table,data.frame(result_decideTest_3100@.Data))
et_3100$table$diffexpressed <- "NotSig"
et_3100$table$diffexpressed[et_3100$table$condition.2.condition.1 == -1] <- "Down"
et_3100$table$diffexpressed[et_3100$table$condition.2.condition.1 == 1] <- "Up"
et_3100$table$diffexpressed[et_3100$table$condition.2.condition.1 == 0] <- "NotSig"

# _6
et_655$table <- cbind(et_655$table,data.frame(result_decideTest_655@.Data))
et_655$table$diffexpressed <- "NotSig"
et_655$table$diffexpressed[et_655$table$condition.2.condition.1 == -1] <- "Down"
et_655$table$diffexpressed[et_655$table$condition.2.condition.1 == 1] <- "Up"
et_655$table$diffexpressed[et_655$table$condition.2.condition.1 == 0] <- "NotSig"

et_672$table <- cbind(et_672$table,data.frame(result_decideTest_672@.Data))
et_672$table$diffexpressed <- "NotSig"
et_672$table$diffexpressed[et_672$table$condition.2.condition.1 == -1] <- "Down"
et_672$table$diffexpressed[et_672$table$condition.2.condition.1 == 1] <- "Up"
et_672$table$diffexpressed[et_672$table$condition.2.condition.1 == 0] <- "NotSig"

et_6100$table <- cbind(et_6100$table,data.frame(result_decideTest_6100@.Data))
et_6100$table$diffexpressed <- "NotSig"
et_6100$table$diffexpressed[et_6100$table$condition.2.condition.1 == -1] <- "Down"
et_6100$table$diffexpressed[et_6100$table$condition.2.condition.1 == 1] <- "Up"
et_6100$table$diffexpressed[et_6100$table$condition.2.condition.1 == 0] <- "NotSig"

# _9
et_955$table <- cbind(et_955$table,data.frame(result_decideTest_955@.Data))
et_955$table$diffexpressed <- "NotSig"
et_955$table$diffexpressed[et_955$table$condition.2.condition.1 == -1] <- "Down"
et_955$table$diffexpressed[et_955$table$condition.2.condition.1 == 1] <- "Up"
et_955$table$diffexpressed[et_955$table$condition.2.condition.1 == 0] <- "NotSig"

et_972$table <- cbind(et_972$table,data.frame(result_decideTest_972@.Data))
et_972$table$diffexpressed <- "NotSig"
et_972$table$diffexpressed[et_972$table$condition.2.condition.1 == -1] <- "Down"
et_972$table$diffexpressed[et_972$table$condition.2.condition.1 == 1] <- "Up"
et_972$table$diffexpressed[et_972$table$condition.2.condition.1 == 0] <- "NotSig"

et_9100$table <- cbind(et_9100$table,data.frame(result_decideTest_9100@.Data))
et_9100$table$diffexpressed <- "NotSig"
et_9100$table$diffexpressed[et_9100$table$condition.2.condition.1 == -1] <- "Down"
et_9100$table$diffexpressed[et_9100$table$condition.2.condition.1 == 1] <- "Up"
et_9100$table$diffexpressed[et_9100$table$condition.2.condition.1 == 0] <- "NotSig"

Plot log-fold change against log-counts per million, with DE genes highlighted:

#plotMD(et)
#abline(h=c(-1, 1), col="blue")

The blue lines indicate 2-fold changes.

Volcano plot

arrange_et_355 <- et_355
arrange_et_372 <- et_372
arrange_et_3100 <- et_3100

arrange_et_655 <- et_655
arrange_et_672 <- et_672
arrange_et_6100 <- et_6100

arrange_et_955 <- et_955
arrange_et_972 <- et_972
arrange_et_9100 <- et_9100

# arrange the et by using p-value (ascending order). Actually, we identify the differential expression genes by using FDR or adjusted p-value.
arrange_et_355$table <- arrange_et_355$table %>% arrange(PValue) %>% mutate(genelabels = "")
arrange_et_355$table$genelabels[1:10] <- arrange_et_355$table$gene[1:10]
arrange_et_372$table <- arrange_et_372$table %>% arrange(PValue) %>% mutate(genelabels = "")
arrange_et_372$table$genelabels[1:10] <- arrange_et_372$table$gene[1:10]
arrange_et_3100$table <- arrange_et_3100$table %>% arrange(PValue) %>% mutate(genelabels = "")
arrange_et_3100$table$genelabels[1:10] <- arrange_et_3100$table$gene[1:10]

arrange_et_655$table <- arrange_et_655$table %>% arrange(PValue) %>% mutate(genelabels = "")
arrange_et_655$table$genelabels[1:10] <- arrange_et_655$table$gene[1:10]
arrange_et_672$table <- arrange_et_672$table %>% arrange(PValue) %>% mutate(genelabels = "")
arrange_et_672$table$genelabels[1:10] <- arrange_et_672$table$gene[1:10]
arrange_et_6100$table <- arrange_et_6100$table %>% arrange(PValue) %>% mutate(genelabels = "")
arrange_et_6100$table$genelabels[1:10] <- arrange_et_6100$table$gene[1:10]

arrange_et_955$table <- arrange_et_955$table %>% arrange(PValue) %>% mutate(genelabels = "")
arrange_et_955$table$genelabels[1:10] <- arrange_et_955$table$gene[1:10]
arrange_et_972$table <- arrange_et_972$table %>% arrange(PValue) %>% mutate(genelabels = "")
arrange_et_972$table$genelabels[1:10] <- arrange_et_972$table$gene[1:10]
arrange_et_9100$table <- arrange_et_9100$table %>% arrange(PValue) %>% mutate(genelabels = "")
arrange_et_9100$table$genelabels[1:10] <- arrange_et_9100$table$gene[1:10]
# _3
volca_355 <- ggplot(data=arrange_et_355$table, aes(x=arrange_et_355$table$logFC, 
                          y=-log10(arrange_et_355$table$PValue))) + 
  geom_point(aes(col=diffexpressed)) + 
  theme_minimal() + 
  geom_text_repel(aes(label=genelabels), max.overlaps = getOption("ggrepel.max.overlaps", default = 10)) + 
  scale_color_manual(values=c("blue", "black", "red")) +
  geom_vline(xintercept=c(0, 0), col="red") +
  geom_hline(yintercept=-log10(1.377256*10^-3), col="red") +
  ggtitle("3_500_500 differential expression") +
  xlab("log2 fold change") +
  ylab("-log10 p-value")

volca_372 <- ggplot(data=arrange_et_372$table, aes(x=arrange_et_372$table$logFC, 
                          y=-log10(arrange_et_372$table$PValue))) + 
  geom_point(aes(col=diffexpressed)) + 
  theme_minimal() + 
  geom_text_repel(aes(label=genelabels), max.overlaps = getOption("ggrepel.max.overlaps", default = 10)) + 
  scale_color_manual(values=c("blue", "black", "red")) +
  geom_vline(xintercept=c(0, 0), col="red") +
  geom_hline(yintercept=-log10(1.498174*10^-3), col="red") +
  ggtitle("3_750_250 differential expression") +
  xlab("log2 fold change") +
  ylab("-log10 p-value")

volca_3100 <- ggplot(data=arrange_et_3100$table, aes(x=arrange_et_3100$table$logFC, 
                          y=-log10(arrange_et_3100$table$PValue))) + 
  geom_point(aes(col=diffexpressed)) + 
  theme_minimal() + 
  geom_text_repel(aes(label=genelabels), max.overlaps = getOption("ggrepel.max.overlaps", default = 10)) + 
  scale_color_manual(values=c("blue", "black", "red")) +
  geom_vline(xintercept=c(0, 0), col="red") +
  geom_hline(yintercept=-log10(1.590792*10^-3), col="red") +
  ggtitle("3_1000_0 differential expression") +
  xlab("log2 fold change") +
  ylab("-log10 p-value")

# _6
volca_655 <- ggplot(data=arrange_et_655$table, aes(x=arrange_et_655$table$logFC, 
                          y=-log10(arrange_et_655$table$PValue))) + 
  geom_point(aes(col=diffexpressed)) + 
  theme_minimal() + 
  geom_text_repel(aes(label=genelabels), max.overlaps = getOption("ggrepel.max.overlaps", default = 10)) + 
  scale_color_manual(values=c("blue", "black", "red")) +
  geom_vline(xintercept=c(0, 0), col="red") +
  geom_hline(yintercept=-log10(2.777723*10^-3), col="red") +
  ggtitle("6_500_500 differential expression") +
  xlab("log2 fold change") +
  ylab("-log10 p-value")

volca_672 <- ggplot(data=arrange_et_672$table, aes(x=arrange_et_672$table$logFC, 
                          y=-log10(arrange_et_672$table$PValue))) + 
  geom_point(aes(col=diffexpressed)) + 
  theme_minimal() + 
  geom_text_repel(aes(label=genelabels), max.overlaps = getOption("ggrepel.max.overlaps", default = 10)) + 
  scale_color_manual(values=c("blue", "black", "red")) +
  geom_vline(xintercept=c(0, 0), col="red") +
  geom_hline(yintercept=-log10(2.953149*10^-3), col="red") +
  ggtitle("6_750_250 differential expression") +
  xlab("log2 fold change") +
  ylab("-log10 p-value")

volca_6100 <- ggplot(data=arrange_et_6100$table, aes(x=arrange_et_6100$table$logFC, 
                          y=-log10(arrange_et_6100$table$PValue))) + 
  geom_point(aes(col=diffexpressed)) + 
  theme_minimal() + 
  geom_text_repel(aes(label=genelabels), max.overlaps = getOption("ggrepel.max.overlaps", default = 10)) + 
  scale_color_manual(values=c("blue", "black", "red")) +
  geom_vline(xintercept=c(0, 0), col="red") +
  geom_hline(yintercept=-log10(2.922512*10^-3), col="red") +
  ggtitle("6_1000_0 differential expression") +
  xlab("log2 fold change") +
  ylab("-log10 p-value")

#_9
volca_955 <- ggplot(data=arrange_et_955$table, aes(x=arrange_et_955$table$logFC, 
                          y=-log10(arrange_et_955$table$PValue))) + 
  geom_point(aes(col=diffexpressed)) + 
  theme_minimal() + 
  geom_text_repel(aes(label=genelabels), max.overlaps = getOption("ggrepel.max.overlaps", default = 10)) + 
  scale_color_manual(values=c("blue", "black", "red")) +
  geom_vline(xintercept=c(0, 0), col="red") +
  geom_hline(yintercept=-log10(3.557585*10^-3), col="red") +
  ggtitle("9_500_500 differential expression") +
  xlab("log2 fold change") +
  ylab("-log10 p-value")

volca_972 <- ggplot(data=arrange_et_972$table, aes(x=arrange_et_972$table$logFC, 
                          y=-log10(arrange_et_972$table$PValue))) + 
  geom_point(aes(col=diffexpressed)) + 
  theme_minimal() + 
  geom_text_repel(aes(label=genelabels), max.overlaps = getOption("ggrepel.max.overlaps", default = 10)) + 
  scale_color_manual(values=c("blue", "black", "red")) +
  geom_vline(xintercept=c(0, 0), col="red") +
  geom_hline(yintercept=-log10(3.600677*10^-3), col="red") +
  ggtitle("9_750_250 differential expression") +
  xlab("log2 fold change") +
  ylab("-log10 p-value")

volca_9100 <- ggplot(data=arrange_et_9100$table, aes(x=arrange_et_9100$table$logFC, 
                          y=-log10(arrange_et_9100$table$PValue))) + 
  geom_point(aes(col=diffexpressed)) + 
  theme_minimal() + 
  geom_text_repel(aes(label=genelabels), max.overlaps = getOption("ggrepel.max.overlaps", default = 10)) + 
  scale_color_manual(values=c("blue", "black", "red")) +
  geom_vline(xintercept=c(0, 0), col="red") +
  geom_hline(yintercept=-log10(3.322350*10^-3), col="red") +
  ggtitle("9_1000_0 differential expression") +
  xlab("log2 fold change") +
  ylab("-log10 p-value")

figure2 <- ggarrange(volca_355, volca_372, volca_3100,
                    volca_655, volca_672, volca_6100,
                    volca_955, volca_972, volca_9100,
                    labels = c("A", "B", "C", 
                               "D", "E", "F",
                               "G", "H", "I"),
                    ncol = 3, nrow = 3)
figure2
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## ggrepel: 9 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

This volcano plot represents differential expression of genes. On the x-axis we typically find the fold change and on the y-axis the p-value.